library(readr)
library(ggplot2)
news <- read_csv("newsdiet.csv")
all <- read_csv("alldiet.csv")

NEWS CLUSTERING

newscols <- 4:96
news[,newscols] <- log(news[,newscols]+1) # columns have huge right tails
news[,newscols] <- apply(news[,newscols],2,scale)
newscols <- c(4:37,39:96) # drop EAmericanNow since entire column is zeroes

drop <- c(7,10,18,21,35,37,39,44,51,77,90,91,96)
drop <- c(drop,57,58,72,78)
num.clusters <- 2:20
F <- double(length(num.clusters))
Rsqr <- double(length(num.clusters))
sse <- double(length(num.clusters))
for (i in num.clusters) {
  temp <- kmeans(news[,-c(1:3,38,97:106,drop)], centers = i)
  summary(temp)
  F[i-num.clusters[1]+1] <- invisible(summary(temp)$F)
  Rsqr[i-num.clusters[1]+1] <- invisible(summary(temp)$Rsqr)
  sse[i-num.clusters[1]+1] <- sum(temp$withinss)
}
plot(num.clusters,F, main="Pseudo F")
lines(num.clusters,F)

plot(num.clusters,Rsqr, main="R-squared")
lines(num.clusters,Rsqr)

plot(num.clusters,sse,main="Screeplot SSE vs. # Clusters")
lines(num.clusters,sse)

news.kmeans <- kmeans(news[,-c(1:3,38,97:106,drop)], centers = 10, iter.max = 50, nstart = 10)
news.summary <- summary(news.kmeans)
news.summary[1:4]
## $sse
## [1] 16966792
## 
## $ssb
## [1] 9342158
## 
## $Rsqr
## [1] 0.3550943
## 
## $F
## [1] 21460.31
plot(news.kmeans)

news.cluster.names <- c("Avoiders", "Finance", "CBS/ABC/Ent", "Mid Fox", "MSNBC/CNN", "Sports", "High Fox", "Low All", "CNN", "Hounds")

NON-NEWS CLUSTERING

networks <- read_csv("networks2.csv",col_names=FALSE)
allcols <- c(4:204)
all[,allcols] <- log(all[,allcols]+1) # columns have huge right tails
all[,allcols] <- apply(all[,allcols],2,scale)
dropnews <- c(9:10,20:32,38,139:204) # drop news shows & aggregated genres
drop2 <- c(4,37,39:41,46,57,64,67,71,77:78,97:98,100,112,114:116,124,126,129,138) # dropped from PCA/alpha
all.reorder <- all[,-c(1:3,dropnews,drop2)]
all.reorder <- all.reorder[networks$X1]
names(all.reorder) <- paste(networks$X2, names(all.reorder), sep=" ")
all.reorder <- cbind(all[,1:3],all.reorder)
num.clusters <- 2:20
F.all <- double(length(num.clusters))
Rsqr.all <- double(length(num.clusters))
sse.all <- double(length(num.clusters))
for (i in num.clusters) {
  temp <- kmeans(all.reorder[,-c(1:3)], centers = i)
  summary(temp)
  F.all[i-num.clusters[1]+1] <- invisible(summary(temp)$F)
  Rsqr.all[i-num.clusters[1]+1] <- invisible(summary(temp)$Rsqr)
  sse.all[i-num.clusters[1]+1] <- sum(temp$withinss)
}
plot(num.clusters,F.all, main="Pseudo F")
lines(num.clusters,F.all)

plot(num.clusters,Rsqr.all, main="R-squared")
lines(num.clusters,Rsqr.all)

plot(num.clusters,sse.all,main="Screeplot SSE vs. # Clusters")
lines(num.clusters,sse.all)

all.kmeans <- kmeans(all.reorder[,-c(1:3)], centers = 10, iter.max = 50, nstart = 10) #
all.summary <- summary(all.kmeans)
all.summary[1:4]
## $sse
## [1] 24910516
## 
## $ssb
## [1] 8764940
## 
## $Rsqr
## [1] 0.2602768
## 
## $F
## [1] 13713.71
plot(all.kmeans) #

all.cluster.names <- c("Westerns/Old Shows", "Sitcoms", "Animated", "Talk Shows/Daytime Shows", "Avoiders", "HGTV/Talk Shows", "Above Avg All Shows", "Disney/CN (Kids)", "Below Avg All Shows", "HGTV Only")

CLUSTER CROSSTAB

news.clusters <- cbind(household_id=news$household_id,news.clusters=news.kmeans$cluster)
all.clusters <- cbind(household_id=all$household_id,all.clusters=all.kmeans$cluster)
clusters <- merge(all.clusters,news.clusters)
count.per.cluster <- table(clusters[,2:3])
pct.per.cluster <- round(count.per.cluster/nrow(clusters)*100,3)
pct.per.col <- apply(count.per.cluster,2,function(x) round(x/sum(x)*100,3))
count.per.cluster
##             news.clusters
## all.clusters     1     2     3     4     5     6     7     8     9    10
##           1   5662   666  2005  2745   860  2282  3100  6730  2366  1456
##           2  10782   957  1557  3288  1023  7118  2240  6749  2825  1620
##           3   5636   753  3483  2410   700  5258  1353  8181  2916  1214
##           4   2034  1015  8633  1651  1517  2153  2408  8197  4004  2075
##           5  31653   866   110  4389   945  3979  3478  2754  1638   666
##           6   2504  1567  6475  2420  1322  2705  3004  7914  3619  2407
##           7   2957   745  4669  1564   982  1788  1290  4767  3048  2800
##           8  15016   346   428  2138   425  5160  1075  5208  1450   349
##           9   6770  1648  2786  3877  1771  5409  4497 11884  3763  1547
##           10  8132  1240  1006  4170  1066  4198  4078  7130  2386  1187
pct.per.cluster
##             news.clusters
## all.clusters     1     2     3     4     5     6     7     8     9    10
##           1  1.614 0.190 0.572 0.783 0.245 0.651 0.884 1.919 0.674 0.415
##           2  3.074 0.273 0.444 0.937 0.292 2.029 0.639 1.924 0.805 0.462
##           3  1.607 0.215 0.993 0.687 0.200 1.499 0.386 2.332 0.831 0.346
##           4  0.580 0.289 2.461 0.471 0.432 0.614 0.686 2.337 1.141 0.592
##           5  9.023 0.247 0.031 1.251 0.269 1.134 0.991 0.785 0.467 0.190
##           6  0.714 0.447 1.846 0.690 0.377 0.771 0.856 2.256 1.032 0.686
##           7  0.843 0.212 1.331 0.446 0.280 0.510 0.368 1.359 0.869 0.798
##           8  4.281 0.099 0.122 0.609 0.121 1.471 0.306 1.485 0.413 0.099
##           9  1.930 0.470 0.794 1.105 0.505 1.542 1.282 3.388 1.073 0.441
##           10 2.318 0.353 0.287 1.189 0.304 1.197 1.163 2.033 0.680 0.338

CHI-SQUARE TEST

chisq.test(count.per.cluster)$stdres
##             news.clusters
## all.clusters           1           2           3           4           5           6           7           8           9
##           1  -22.4934375  -4.2765928 -10.3193188  10.6780573   0.6158863 -17.6716554  23.4395881  18.8992268   3.2253665
##           2   10.7209894  -3.5987291 -34.9179459   3.3897620  -4.1563242  47.0837715 -13.2338876 -11.0571145  -4.4508726
##           3  -35.5324177  -4.9372278  13.4119350  -4.2000030  -9.0877100  29.8281154 -23.5271471  27.3804204   7.9722529
##           4  -87.7979820   2.5587067 113.6454475 -23.0271529  16.6622366 -30.5086237  -3.0145320  21.8713504  27.7696856
##           5  203.3375743 -15.8963274 -73.9459810   4.6720902 -16.3440552 -26.9878096  -6.1618149 -87.4800131 -42.4708444
##           6  -82.2332145  21.4372480  69.4955417  -7.3397679   9.8522230 -21.0067969   9.4632169  17.0344090  19.1456158
##           7  -51.8165431   2.2964314  57.7130382 -10.7681947   9.1694350 -21.2391834 -14.2717401  -1.8219115  26.3993592
##           8   91.5384503 -19.2138201 -49.2992233  -9.5323110 -18.2753808  28.7959349 -29.3110145 -15.5804053 -23.3507037
##           9  -54.0800814  12.9881133 -20.0301980   5.3451931  13.1466400   6.2692045  22.6440002  40.6124932   4.7570942
##           10 -11.0589807   9.3895530 -41.1309330  27.8003017   0.6478032   4.4242665  31.3261380   3.9047078  -7.8697092
##             news.clusters
## all.clusters          10
##           1    7.2903702
##           2   -1.2373737
##           3   -5.1556355
##           4   16.9269144
##           5  -36.2168584
##           6   25.8443264
##           7   55.8003878
##           8  -29.7507567
##           9   -9.2994629
##           10  -8.9746885
df.heatmap <- expand.grid(all.clusters=all.cluster.names, news.clusters=news.cluster.names)
m <- as.matrix(pct.per.cluster)
q <- c()
for (i in 1:ncol(m)) q <- c(q,m[,i])
df.heatmap$proportion <- as.numeric(q)
df.heatmap$labels <- paste(df.heatmap$proportion,"%", sep="")

Heatmap of overall proportions (sum of all tiles is 100%)

ggplot(data = df.heatmap, aes(x = news.clusters, y = all.clusters)) +
  geom_tile(aes(fill = proportion)) +
  scale_fill_continuous(low = "white", high = "blue", limits=c(0,10)) +
  scale_x_discrete(position = "top") +
  geom_text(aes(label = labels))

Heatmap of within-news-clusters proportions (each column sums to 100%)

df.heatmap2 <- expand.grid(all.clusters=all.cluster.names, news.clusters=news.cluster.names)
m <- as.matrix(pct.per.col)
q <- c()
for (i in 1:ncol(m)) q <- c(q,m[,i])
df.heatmap2$proportion <- as.numeric(q)
df.heatmap2$labels <- paste(df.heatmap2$proportion,"%", sep="")
ggplot(data = df.heatmap2, aes(x = news.clusters, y = all.clusters)) +
  geom_tile(aes(fill = proportion)) +
  scale_fill_continuous(low = "white", high = "blue", limits=c(0,35)) +
  scale_x_discrete(position = "top") +
  geom_text(aes(label = labels)) +
  theme(axis.text=element_text(size=12, face="bold"))